#' 清洗数据
#'
#'
#' @param df 数据框
#' @param continuous 逻辑值，默认FALSE；如果表型是连续变量请设为TRUE，此时不会询问病例数(ncase)和对照数(ncontrol)
#' @param save 默认FALSE，如果是TRUE的话会保存为本地文件，文件名为id的名字
#'
#' @return 返回清洗后的数据
#' @export
#'
#' @examples
#'
#' #函数的功能是自动转换数据框的列名，不区分大小写
#'
#' #自动将"snp","rsid","rsids","snpid","rnpid","rs","variant_id"的列名改为"SNP"
#' #自动将"chr","#chrom","chromosome"的列名改为"chr"
#' #自动将"pos","position","base_pair_location"的列名改为"pos"
#' #自动将"effect_allele","ea","alt", "alts","Tested_Allele","Alternate.Allele"的列名改为"effect_allele"
#' #自动将"other_allele","oa","ref","Reference.Allele","NEA"的列名改为"other_allele"
#' #自动将"beta","b","Effect","LogOR"的列名改为"beta"
#' #自动将"se","sebeta","standard error","standard_error","StdErr","StdErrLogOR"的列名改为"se"
#' #自动将"pval","p","p_value","pvalue"的列名改为"pval"
#' #自动将"z","zscore"的列名改为"z"
#' #自动将"eaf","FREQ", "af_alt", "FREQ1","effect_allele_frequency","Freq_Tested_Allele",
#' #  "Alternate.Allele.Frequency"的列名改为"eaf"
#' #自动将"samplesize","n","sample_size","TotalSampleSize"的列名改为"samplesize"
#' #自动将"ncase","n_cases", "ncases", "n_case"的列名改为"ncase"
#' #自动将"ncontrol","n_controls","ncontrols", "n_control","Ntotal" 的列名改为"ncontrol"
#' #自动将"or","odds ratio","odds_ratio"的列名改为"or"
#' #自动将"gene","nearest_genes"的列名改为"gene"
#' #自动将"info"的列名改为"info"
#'
#' # 如果没有提供Phenotype，id，样本量等， 则要求填写。
#'
#'
#' # 还需要找到数据的样本量填进去，因为R2的计算需要样本量，如果是病例对照研究，需要case数和control数
#'
#'
#'
#' # 设置随机数种子
#' set.seed(123)
#'
#' # 随机生成df
#' df <- data.frame(
#'  rsid = paste0("rs", sample(1:10000, 1000, replace = TRUE)),
#'  CHr = sample(1:22, 1000, replace = TRUE),
#'  position = sample(1:100000, 1000, replace = TRUE),
#'  ea = sample(c("c","t","a","g"), 1000, replace = TRUE),
#'  oa = sample(c("c","t","a","g"), 1000, replace = TRUE),
#'  FREQ = runif(1000, 0, 1),
#'  B = runif(1000, 0, 1),
#'  SE = runif(1000, 0, 1),
#'  P = runif(1000, 0, 1),
#'  n_cases = rep(1000,1000),
#'  ncontrols = rep(3000,1000),
#'  Phenotype = rep("BMI",1000),
#'  id = rep("id1-bmi",1000)
#' )
#'
#' # 在使用这个函数之前，请先确认自己的列名是否在上面的自动转换列表中，
#' # 如果没有，就可以用dplyr的rename函数单个的改列名
#' df <- dplyr::rename(df, chr = CHr)
#'
#' # 查看前几行
#' head(df)
#'
#' df <- U1_Clean_data(df)
#'
#' # 手动导出为parquet文件，也可以在U1_Clean_data中用save=TRUE导出文件
#' \dontrun{
#'   arrow::write_parquet(df,"id1-bmi.parquet", compression = "gzip")
#' }
#'
#' # 如果没有eaf可以用maf代替，不影响R2的计算
#'
#'
#'

U1_Clean_data <- function(df, continuous = FALSE, save = FALSE) {
  # Exported wrapper around Clean_data():
  #   1. warns when the table has at most 100000 rows,
  #   2. standardises the table with Clean_data(),
  #   3. prepends the class "df" to the result,
  #   4. when `save` is TRUE, writes the result to "<first id value>.parquet"
  #      (gzip-compressed) in the current working directory.
  # Returns the cleaned table.
  too_few_rows <- nrow(df) <= 100000
  if (too_few_rows) {
    warning("SNP个数太少,有可能不能作为结局分析")
  }

  cleaned <- Clean_data(df = df, continuous = continuous)
  class(cleaned) <- c("df", class(cleaned))

  if (save) {
    # The file name is taken from the first entry of the `id` column.
    out_path <- paste0(cleaned$id[1], ".parquet")
    arrow::write_parquet(cleaned, out_path, compression = "gzip")
  }

  cleaned
}


# Standardise a GWAS summary-statistics table.
#
# Steps, in order:
#   1. Rename known column aliases (case-insensitive) to canonical names.
#   2. Warn about missing key columns (SNP, beta, se, alleles, pval) and
#      print a hint when chr/pos are missing.
#   3. Coerce `pos` and `pval` to numeric.
#   4. Derive missing `beta`, `se` and `pval` from the columns that exist.
#   5. Interactively ask for Phenotype, id, ncase/ncontrol and samplesize
#      when they are not present in the table.
#
# @param df         A data.frame (or subclass) of summary statistics.
# @param continuous If TRUE, the case/control questions are skipped.
# @return The cleaned data.frame.
#
# Raises an error (via stop()) when two columns end up with the same name
# after renaming.
Clean_data <- function(df, continuous = FALSE) {

  # Rename every column whose name matches one of `patterns`
  # (case-insensitive) to `format`, printing a message for each column that
  # actually changes. Stops if the resulting column names are not unique.
  rename_col <- function(df, patterns, format) {
    col_names <- colnames(df)
    matched <- which(tolower(col_names) %in% tolower(patterns))

    for (i in matched) {
      if (col_names[i] != format) {
        cat("将", col_names[i], "替换为", format, "\n")
      }
      col_names[i] <- format
    }
    colnames(df) <- col_names

    if (anyDuplicated(colnames(df)) > 0) {
      stop(paste(format, "列名重复，请用`help(U1_Clean_data)`查看说明文档，只提供一个!\n"), call. = FALSE)
    }

    df
  }

  # Any data.frame-like input is passed through as.data.frame().
  if (is.data.frame(df)) {
    df <- as.data.frame(df)
  }

  # Canonical column name -> accepted aliases. The list order is the order
  # in which the renames are applied.
  # NOTE(review): "Ntotal" is mapped to `ncontrol`, not `samplesize` --
  # confirm this is intended.
  alias_table <- list(
    SNP = c("snp", "rsid", "rsids", "snpid", "rnpid", "rs", "variant_id"),
    chr = c("chr", "#chrom", "chromosome"),
    pos = c("pos", "position", "base_pair_location"),
    effect_allele = c("effect_allele", "ea", "alt", "alts", "Tested_Allele", "Alternate.Allele"),
    other_allele = c("other_allele", "oa", "ref", "Reference.Allele", "NEA"),
    beta = c("beta", "b", "Effect", "LogOR"),
    se = c("se", "sebeta", "standard error", "standard_error", "StdErr", "StdErrLogOR"),
    pval = c("pval", "p", "p_value", "pvalue"),
    z = c("z", "zscore"),
    eaf = c("eaf", "FREQ", "af_alt", "FREQ1", "effect_allele_frequency", "Freq_Tested_Allele", "Alternate.Allele.Frequency"),
    samplesize = c("samplesize", "n", "sample_size", "TotalSampleSize"),
    ncase = c("ncase", "n_cases", "ncases", "n_case"),
    ncontrol = c("ncontrol", "n_controls", "ncontrols", "n_control", "Ntotal"),
    or = c("or", "odds ratio", "odds_ratio"),
    gene = c("gene", "nearest_genes"),
    info = c("info")
  )
  for (target in names(alias_table)) {
    df <- rename_col(df, patterns = alias_table[[target]], format = target)
  }

  # TRUE when the current table has a column called `name`.
  has <- function(name) name %in% colnames(df)
  # Styled help() hint embedded in the messages below.
  help_link <- function(topic) cli::style_underline(cli::col_br_red(topic))

  # SNP column: must exist and should mostly hold rs IDs.
  if (!has("SNP")) {
    warning("SNP非常重要，请用", help_link("help(U1_Clean_data)"), "查看说明文档，提供一个SNP列! \n")
  } else {
    rs_ratio <- mean(grepl("^rs\\d+$", df[["SNP"]]))
    # isTRUE() keeps a zero-row table (ratio is NaN) from breaking the `if`.
    if (isTRUE(rs_ratio < 0.5)) {
      warning("大量或全部的SNP不是rs123456这种格式，请用", help_link("help(U1_add_SNP)"), "查看说明文档，匹配SNP列！\n")
    }
  }

  if (!has("chr") || !has("pos")) {
    cat("没有chr或pos，请用", help_link("help(U1_Clean_data)"), "查看说明文档，提供一个chr或pos列! 或者用", help_link("help(U1_add_chr_pos)"), "查看说明文档进行匹配chr和pos，实在没有也不影响分析 \n")
  }

  if (has("pos")) {
    df$pos <- as.numeric(df$pos)
  }

  # Coerce pval first: the beta/se derivations below feed it to qchisq(),
  # which fails on non-numeric input.
  if (has("pval") && !is.numeric(df$pval)) {
    cat("将pval转换为数值\n")
    df$pval <- as.numeric(df$pval)
  }

  # beta: prefer log(or); otherwise reconstruct from se and pval.
  if (!has("beta")) {
    if (has("or")) {
      cat("没有提供beta，根据or计算，公式为beta = log(or) \n")
      df$beta <- log(as.numeric(df[["or"]]))
    } else if (has("se") && has("pval")) {
      cat("没有提供beta，根据se和pval计算，公式为beta = se * sqrt(qchisq(pval,1,lower.tail=F) ) \n")
      # sqrt(qchisq(...)) is non-negative, so this yields se * |z|: the
      # sign of the effect is not recovered here.
      df$beta <- as.numeric(df[["se"]]) * sqrt(qchisq(df$pval, 1, lower.tail = FALSE))
    } else {
      warning("beta非常重要，请用", help_link("help(U1_Clean_data)"), "查看说明文档，提供一个beta列! \n")
    }
  }

  # se: reconstruct from beta and pval. abs() keeps the standard error
  # non-negative when beta is negative.
  if (!has("se")) {
    if (has("beta") && has("pval")) {
      cat("没有提供se，根据beta和pval计算，公式为se = abs(beta) / sqrt(qchisq(pval,1,lower.tail=F) ) \n")
      df$se <- abs(as.numeric(df[["beta"]])) / sqrt(qchisq(df$pval, 1, lower.tail = FALSE))
    } else {
      warning("se非常重要，请用", help_link("help(U1_Clean_data)"), "查看说明文档，提供一个se列! \n")
    }
  }

  if (!has("effect_allele")) {
    warning("effect_allele非常重要，请用", help_link("help(U1_Clean_data)"), "查看说明文档，提供一个effect_allele列! \n")
  }

  if (!has("other_allele")) {
    warning("other_allele非常重要，请用", help_link("help(U1_Clean_data)"), "查看说明文档，提供一个other_allele列! \n")
  }

  # pval: when absent, derive it from beta/se, else from z.
  # (z itself is never derived from pval.)
  if (!has("pval")) {
    if (has("beta") && has("se")) {
      cat("没有提供pval，根据beta，se计算，公式为pval<- 2*pnorm(abs(beta/se),lower.tail=FALSE) \n")
      df$beta <- as.numeric(df$beta)
      df$se <- as.numeric(df$se)
      df$pval <- as.numeric(2 * pnorm(abs(df$beta / df$se), lower.tail = FALSE))
    } else if (has("z")) {
      cat("没有提供pval,也不能根据beta和se计算，于是根据z计算，公式为pval<- 2*pnorm(abs(z),lower.tail=FALSE) \n")
      df$z <- as.numeric(df$z)
      df$pval <- as.numeric(2 * pnorm(abs(df$z), lower.tail = FALSE))
    } else {
      warning("pval非常重要，请用", help_link("help(U1_Clean_data)"), "查看说明文档，提供一个pval列! \n")
    }
  }

  # Interactive prompts for metadata that is missing from the table.
  if (!has("Phenotype")) {
    df$Phenotype <- readline("输入表型名称，例如Body Mass Index，然后按Enter:")
  }

  if (!has("id")) {
    df$id <- readline("为您的表型制作一个唯一的ID，例如Oneclick-0001，然后按Enter:")
  }

  # For non-continuous traits, ask whether this is case/control data and
  # collect whichever of ncase/ncontrol is missing.
  if (!continuous && (!has("ncase") || !has("ncontrol"))) {
    is_case_control <- yesno::yesno2("是病例对照数据吗，例如疾病有病例和对照?是选1,不是选2", yes = "是的", no = "否")
    if (!has("ncase") && is_case_control) {
      df$ncase <- as.numeric(readline("输入病例数，例如4560，然后按Enter:"))
    }
    if (!has("ncontrol") && is_case_control) {
      df$ncontrol <- as.numeric(readline("输入对照数，例如78200，然后按Enter:"))
    }
  }

  # samplesize: ncase + ncontrol when both exist, otherwise ask the user.
  if (!has("samplesize")) {
    if (has("ncase") && has("ncontrol")) {
      cat("没有提供samplesize，根据ncase和ncontrol计算，公式为samplesize<- ncase + ncontrol \n")
      df$samplesize <- as.numeric(df$ncase) + as.numeric(df$ncontrol)
    } else {
      df$samplesize <- as.numeric(readline("输入总样本量大小，例如478000，然后按Enter:"))
    }
  }

  df
}


